library(tidyverse)
## ─ Attaching packages ─────────────── tidyverse 1.3.1 ─
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ─ Conflicts ───────────────── tidyverse_conflicts() ─
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
#STEP 1:
# Load the 2004 and 2019 daily PM2.5 exports. Both files sit in the same
# folder, so the folder is named once; edit `data_dir` to run elsewhere.
data_dir <- "~/Desktop/USC/PM566-Labs/Assignment_01"
csv_2004 <- file.path(data_dir, "2004.csv")
csv_2019 <- file.path(data_dir, "2019.csv")
# Stop early and name the offending path(s) instead of a generic read error.
csv_paths <- c(csv_2004, csv_2019)
missing_csv <- csv_paths[!file.exists(path.expand(csv_paths))]
if (length(missing_csv) > 0) {
  stop(
    "Input file(s) not found: ", paste(missing_csv, collapse = ", "),
    call. = FALSE
  )
}
data_2004 <- data.table::fread(csv_2004)
data_2019 <- data.table::fread(csv_2019)
dim(data_2004)
## [1] 19233 20
dim(data_2019)
## [1] 53086 20
head(data_2004)
## Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
## 1: 01/01/2004 AQS 60010007 1 11.0 ug/m3 LC
## 2: 01/02/2004 AQS 60010007 1 12.2 ug/m3 LC
## 3: 01/03/2004 AQS 60010007 1 16.5 ug/m3 LC
## 4: 01/04/2004 AQS 60010007 1 18.1 ug/m3 LC
## 5: 01/05/2004 AQS 60010007 1 11.5 ug/m3 LC
## 6: 01/06/2004 AQS 60010007 1 32.5 ug/m3 LC
## DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1: 46 Livermore 1 100
## 2: 51 Livermore 1 100
## 3: 60 Livermore 1 100
## 4: 64 Livermore 1 100
## 5: 48 Livermore 1 100
## 6: 94 Livermore 1 100
## AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
## 1: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## 2: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## 3: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## 4: 88101 PM2.5 - Local Conditions 41860
## 5: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## 6: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## CBSA_NAME STATE_CODE STATE COUNTY_CODE COUNTY
## 1: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 2: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 3: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 4: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 5: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 6: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## SITE_LATITUDE SITE_LONGITUDE
## 1: 37.68753 -121.7842
## 2: 37.68753 -121.7842
## 3: 37.68753 -121.7842
## 4: 37.68753 -121.7842
## 5: 37.68753 -121.7842
## 6: 37.68753 -121.7842
head(data_2019)
## Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
## 1: 01/01/2019 AQS 60010007 3 5.7 ug/m3 LC
## 2: 01/02/2019 AQS 60010007 3 11.9 ug/m3 LC
## 3: 01/03/2019 AQS 60010007 3 20.1 ug/m3 LC
## 4: 01/04/2019 AQS 60010007 3 28.8 ug/m3 LC
## 5: 01/05/2019 AQS 60010007 3 11.2 ug/m3 LC
## 6: 01/06/2019 AQS 60010007 3 2.7 ug/m3 LC
## DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1: 24 Livermore 1 100
## 2: 50 Livermore 1 100
## 3: 68 Livermore 1 100
## 4: 86 Livermore 1 100
## 5: 47 Livermore 1 100
## 6: 11 Livermore 1 100
## AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
## 1: 88101 PM2.5 - Local Conditions 41860
## 2: 88101 PM2.5 - Local Conditions 41860
## 3: 88101 PM2.5 - Local Conditions 41860
## 4: 88101 PM2.5 - Local Conditions 41860
## 5: 88101 PM2.5 - Local Conditions 41860
## 6: 88101 PM2.5 - Local Conditions 41860
## CBSA_NAME STATE_CODE STATE COUNTY_CODE COUNTY
## 1: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 2: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 3: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 4: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 5: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 6: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## SITE_LATITUDE SITE_LONGITUDE
## 1: 37.68753 -121.7842
## 2: 37.68753 -121.7842
## 3: 37.68753 -121.7842
## 4: 37.68753 -121.7842
## 5: 37.68753 -121.7842
## 6: 37.68753 -121.7842
tail(data_2004)
## Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
## 1: 12/14/2004 AQS 61131003 1 11 ug/m3 LC
## 2: 12/17/2004 AQS 61131003 1 16 ug/m3 LC
## 3: 12/20/2004 AQS 61131003 1 17 ug/m3 LC
## 4: 12/23/2004 AQS 61131003 1 9 ug/m3 LC
## 5: 12/26/2004 AQS 61131003 1 24 ug/m3 LC
## 6: 12/29/2004 AQS 61131003 1 9 ug/m3 LC
## DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1: 46 Woodland-Gibson Road 1 100
## 2: 59 Woodland-Gibson Road 1 100
## 3: 61 Woodland-Gibson Road 1 100
## 4: 38 Woodland-Gibson Road 1 100
## 5: 76 Woodland-Gibson Road 1 100
## 6: 38 Woodland-Gibson Road 1 100
## AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
## 1: 88101 PM2.5 - Local Conditions 40900
## 2: 88101 PM2.5 - Local Conditions 40900
## 3: 88101 PM2.5 - Local Conditions 40900
## 4: 88101 PM2.5 - Local Conditions 40900
## 5: 88101 PM2.5 - Local Conditions 40900
## 6: 88101 PM2.5 - Local Conditions 40900
## CBSA_NAME STATE_CODE STATE COUNTY_CODE
## 1: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 2: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 3: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 4: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 5: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 6: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## COUNTY SITE_LATITUDE SITE_LONGITUDE
## 1: Yolo 38.66121 -121.7327
## 2: Yolo 38.66121 -121.7327
## 3: Yolo 38.66121 -121.7327
## 4: Yolo 38.66121 -121.7327
## 5: Yolo 38.66121 -121.7327
## 6: Yolo 38.66121 -121.7327
tail(data_2019)
## Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
## 1: 11/11/2019 AQS 61131003 1 13.5 ug/m3 LC
## 2: 11/17/2019 AQS 61131003 1 18.1 ug/m3 LC
## 3: 11/29/2019 AQS 61131003 1 12.5 ug/m3 LC
## 4: 12/17/2019 AQS 61131003 1 23.8 ug/m3 LC
## 5: 12/23/2019 AQS 61131003 1 1.0 ug/m3 LC
## 6: 12/29/2019 AQS 61131003 1 9.1 ug/m3 LC
## DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1: 54 Woodland-Gibson Road 1 100
## 2: 64 Woodland-Gibson Road 1 100
## 3: 52 Woodland-Gibson Road 1 100
## 4: 76 Woodland-Gibson Road 1 100
## 5: 4 Woodland-Gibson Road 1 100
## 6: 38 Woodland-Gibson Road 1 100
## AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
## 1: 88101 PM2.5 - Local Conditions 40900
## 2: 88101 PM2.5 - Local Conditions 40900
## 3: 88101 PM2.5 - Local Conditions 40900
## 4: 88101 PM2.5 - Local Conditions 40900
## 5: 88101 PM2.5 - Local Conditions 40900
## 6: 88101 PM2.5 - Local Conditions 40900
## CBSA_NAME STATE_CODE STATE COUNTY_CODE
## 1: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 2: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 3: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 4: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 5: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 6: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## COUNTY SITE_LATITUDE SITE_LONGITUDE
## 1: Yolo 38.66121 -121.7327
## 2: Yolo 38.66121 -121.7327
## 3: Yolo 38.66121 -121.7327
## 4: Yolo 38.66121 -121.7327
## 5: Yolo 38.66121 -121.7327
## 6: Yolo 38.66121 -121.7327
str(data_2004)
## Classes 'data.table' and 'data.frame': 19233 obs. of 20 variables:
## $ Date : chr "01/01/2004" "01/02/2004" "01/03/2004" "01/04/2004" ...
## $ Source : chr "AQS" "AQS" "AQS" "AQS" ...
## $ Site ID : int 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
## $ POC : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Daily Mean PM2.5 Concentration: num 11 12.2 16.5 18.1 11.5 32.5 14 29.9 21 15.7 ...
## $ UNITS : chr "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
## $ DAILY_AQI_VALUE : int 46 51 60 64 48 94 55 88 70 59 ...
## $ Site Name : chr "Livermore" "Livermore" "Livermore" "Livermore" ...
## $ DAILY_OBS_COUNT : int 1 1 1 1 1 1 1 1 1 1 ...
## $ PERCENT_COMPLETE : num 100 100 100 100 100 100 100 100 100 100 ...
## $ AQS_PARAMETER_CODE : int 88502 88502 88502 88101 88502 88502 88101 88502 88502 88101 ...
## $ AQS_PARAMETER_DESC : chr "Acceptable PM2.5 AQI & Speciation Mass" "Acceptable PM2.5 AQI & Speciation Mass" "Acceptable PM2.5 AQI & Speciation Mass" "PM2.5 - Local Conditions" ...
## $ CBSA_CODE : int 41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
## $ CBSA_NAME : chr "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
## $ STATE_CODE : int 6 6 6 6 6 6 6 6 6 6 ...
## $ STATE : chr "California" "California" "California" "California" ...
## $ COUNTY_CODE : int 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY : chr "Alameda" "Alameda" "Alameda" "Alameda" ...
## $ SITE_LATITUDE : num 37.7 37.7 37.7 37.7 37.7 ...
## $ SITE_LONGITUDE : num -122 -122 -122 -122 -122 ...
## - attr(*, ".internal.selfref")=<externalptr>
summary(data_2004$`Daily Mean PM2.5 Concentration`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.10 6.00 10.10 13.13 16.30 251.00
table(data_2004$`Daily Mean PM2.5 Concentration`)
##
## -0.1 0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1 1.1
## 1 11 14 20 27 32 35 41 43 40 31 94 39
## 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2 2.1 2.2 2.3 2.4
## 40 37 31 30 43 35 37 35 112 59 44 40 46
## 2.5 2.6 2.7 2.8 2.9 3 3.1 3.2 3.3 3.4 3.5 3.6 3.7
## 53 58 48 58 64 197 61 63 88 56 73 82 78
## 3.8 3.9 4 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5
## 70 60 365 68 74 74 79 93 63 75 88 80 425
## 5.1 5.2 5.3 5.4 5.5 5.6 5.7 5.8 5.9 6 6.1 6.2 6.3
## 86 94 98 83 97 75 92 86 104 431 85 96 98
## 6.4 6.5 6.6 6.7 6.8 6.9 7 7.1 7.2 7.3 7.4 7.5 7.6
## 87 100 68 96 81 89 411 86 101 83 91 101 64
## 7.7 7.8 7.9 8 8.1 8.2 8.3 8.4 8.5 8.6 8.7 8.8 8.9
## 123 97 85 399 70 114 87 98 112 85 98 95 90
## 9 9.1 9.2 9.3 9.4 9.5 9.6 9.7 9.8 9.9 10 10.1 10.2
## 355 81 105 81 84 100 70 107 91 73 275 98 101
## 10.3 10.4 10.5 10.6 10.7 10.8 10.9 11 11.1 11.2 11.3 11.4 11.5
## 86 71 109 77 95 86 93 268 70 90 96 72 83
## 11.6 11.7 11.8 11.9 12 12.1 12.2 12.3 12.4 12.5 12.6 12.7 12.8
## 75 75 71 60 228 74 76 71 79 75 83 66 72
## 12.9 13 13.1 13.2 13.3 13.4 13.5 13.6 13.7 13.8 13.9 14 14.1
## 55 170 67 74 62 56 68 64 58 59 49 136 63
## 14.2 14.3 14.4 14.5 14.6 14.7 14.8 14.9 15 15.1 15.2 15.3 15.4
## 64 55 52 45 52 70 50 57 136 50 59 57 52
## 15.5 15.6 15.7 15.8 15.9 16 16.1 16.2 16.3 16.4 16.5 16.6 16.7
## 53 46 48 44 53 127 37 51 48 48 50 41 38
## 16.8 16.9 17 17.1 17.2 17.3 17.4 17.5 17.6 17.7 17.8 17.9 18
## 47 32 96 37 37 32 46 42 35 37 40 25 87
## 18.1 18.2 18.3 18.4 18.5 18.6 18.7 18.8 18.9 19 19.1 19.2 19.3
## 36 38 24 18 33 39 32 23 29 69 30 17 30
## 19.4 19.5 19.6 19.7 19.8 19.9 20 20.1 20.2 20.3 20.4 20.5 20.6
## 19 40 37 21 18 28 85 27 33 29 20 25 25
## 20.7 20.8 20.9 21 21.1 21.2 21.3 21.4 21.5 21.6 21.7 21.8 21.9
## 24 21 19 65 22 16 17 15 23 19 19 15 11
## 22 22.1 22.2 22.3 22.4 22.5 22.6 22.7 22.8 22.9 23 23.1 23.2
## 39 27 16 17 14 22 18 20 18 15 55 22 13
## 23.3 23.4 23.5 23.6 23.7 23.8 23.9 24 24.1 24.2 24.3 24.4 24.5
## 22 20 16 17 16 20 12 47 19 19 14 6 17
## 24.6 24.7 24.8 24.9 25 25.1 25.2 25.3 25.4 25.5 25.6 25.7 25.8
## 13 14 12 14 53 18 14 14 11 16 13 18 7
## 25.9 26 26.1 26.2 26.3 26.4 26.5 26.6 26.7 26.8 26.9 27 27.1
## 12 50 16 8 14 13 14 17 10 9 12 43 14
## 27.2 27.3 27.4 27.5 27.6 27.7 27.8 27.9 28 28.1 28.2 28.3 28.4
## 22 8 3 13 7 6 11 10 38 23 20 14 10
## 28.5 28.6 28.7 28.8 28.9 29 29.1 29.2 29.3 29.4 29.5 29.6 29.7
## 9 10 8 7 17 30 12 10 12 6 15 21 13
## 29.8 29.9 30 30.1 30.2 30.3 30.4 30.5 30.6 30.7 30.8 30.9 31
## 11 10 39 14 15 11 9 10 11 13 10 5 29
## 31.1 31.2 31.3 31.4 31.5 31.6 31.7 31.8 31.9 32 32.1 32.2 32.3
## 6 16 4 10 7 1 8 16 8 27 7 12 12
## 32.4 32.5 32.6 32.7 32.8 32.9 33 33.1 33.2 33.3 33.4 33.5 33.6
## 9 10 9 7 11 10 31 4 4 10 12 12 6
## 33.7 33.8 33.9 34 34.1 34.2 34.3 34.4 34.5 34.6 34.7 34.8 34.9
## 11 7 4 35 3 7 4 11 7 7 4 5 9
## 35 35.1 35.2 35.3 35.4 35.5 35.6 35.7 35.8 35.9 36 36.1 36.2
## 23 7 15 5 5 8 6 5 3 2 24 7 9
## 36.3 36.4 36.5 36.6 36.7 36.8 36.9 37 37.1 37.2 37.3 37.4 37.5
## 3 5 8 6 8 7 5 19 5 5 8 10 4
## 37.6 37.7 37.8 37.9 38 38.1 38.2 38.3 38.4 38.5 38.6 38.7 38.8
## 5 4 7 3 24 3 8 4 7 7 3 6 5
## 38.9 39 39.1 39.2 39.3 39.4 39.5 39.6 39.7 39.8 39.9 40 40.1
## 3 17 6 7 3 6 4 6 6 3 4 7 8
## 40.2 40.3 40.4 40.5 40.6 40.7 40.8 40.9 41 41.1 41.2 41.3 41.4
## 6 4 8 11 1 3 7 3 12 5 7 5 2
## 41.5 41.6 41.7 41.8 41.9 42 42.1 42.2 42.3 42.4 42.5 42.6 42.7
## 9 3 5 3 5 10 5 6 6 5 4 5 5
## 42.9 43 43.1 43.2 43.3 43.4 43.5 43.6 43.7 43.8 43.9 44 44.1
## 8 11 5 6 6 7 3 2 5 1 3 12 4
## 44.2 44.3 44.4 44.5 44.6 44.7 44.8 44.9 45 45.1 45.2 45.3 45.4
## 3 4 2 2 6 4 3 5 6 1 1 1 4
## 45.5 45.7 45.8 45.9 46 46.1 46.2 46.3 46.4 46.5 46.6 46.7 46.8
## 4 7 3 3 3 3 2 1 3 7 2 4 1
## 46.9 47 47.1 47.2 47.3 47.5 47.6 47.7 47.8 47.9 48 48.1 48.2
## 5 8 2 5 2 2 3 3 4 2 3 1 2
## 48.3 48.5 48.6 48.7 48.9 49 49.2 49.3 49.4 49.5 49.6 49.7 49.9
## 2 3 1 4 2 5 1 3 5 1 2 5 1
## 50 50.1 50.2 50.4 50.5 50.6 50.8 50.9 51 51.2 51.4 51.5 51.7
## 4 4 1 2 1 3 1 2 5 4 1 3 2
## 51.8 51.9 52 52.1 52.2 52.4 52.5 52.7 52.8 52.9 53 53.1 53.2
## 1 1 4 2 1 2 3 1 1 3 5 1 2
## 53.3 53.5 53.7 53.8 53.9 54 54.2 54.3 54.4 54.6 54.8 54.9 55
## 1 1 1 3 1 2 2 2 1 2 2 2 3
## 55.1 55.2 55.3 55.5 55.6 55.7 55.8 56 56.1 56.2 56.3 56.4 56.8
## 1 1 1 2 1 1 3 1 1 2 1 3 1
## 57 57.2 57.3 57.4 57.9 58.1 58.4 58.7 58.9 59.1 59.2 59.3 59.4
## 4 1 3 1 1 1 2 1 2 1 2 1 1
## 59.5 59.7 59.9 60 60.1 60.3 60.4 60.5 60.7 60.8 60.9 61 61.2
## 2 2 2 2 1 1 1 1 1 2 1 3 1
## 61.5 61.7 61.8 62.5 62.6 62.7 63.1 63.4 63.9 64 64.9 65 65.3
## 1 1 2 2 1 1 1 1 1 1 1 2 1
## 65.4 66.1 66.3 66.6 67.1 67.3 67.4 68.2 68.6 68.7 68.9 69 69.3
## 2 3 2 2 1 1 3 1 1 1 1 2 1
## 70 70.6 71 71.4 72.4 72.8 73.6 73.7 74.2 74.5 75 75.6 76.8
## 1 2 1 2 1 2 2 1 1 1 1 1 1
## 77.1 77.5 79.8 80.9 81 81.4 81.6 81.9 82.3 83 86.1 90.2 90.7
## 1 1 1 1 1 1 2 1 1 2 1 1 1
## 90.9 91.7 93.4 93.8 95.7 100.4 102.1 110.4 122.5 148.4 170.4 251
## 1 1 1 1 1 1 1 1 1 1 1 1
PM2.5 concentrations should not be negative, so I remove all observations with a negative value.
# Keep only readings of zero or more, then re-check the range of the 2004 data.
data_2004 <- data_2004[`Daily Mean PM2.5 Concentration` >= 0]
summary(data_2004[["Daily Mean PM2.5 Concentration"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 6.00 10.10 13.13 16.30 251.00
str(data_2019)
## Classes 'data.table' and 'data.frame': 53086 obs. of 20 variables:
## $ Date : chr "01/01/2019" "01/02/2019" "01/03/2019" "01/04/2019" ...
## $ Source : chr "AQS" "AQS" "AQS" "AQS" ...
## $ Site ID : int 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
## $ POC : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Daily Mean PM2.5 Concentration: num 5.7 11.9 20.1 28.8 11.2 2.7 2.8 7 3.1 7.1 ...
## $ UNITS : chr "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
## $ DAILY_AQI_VALUE : int 24 50 68 86 47 11 12 29 13 30 ...
## $ Site Name : chr "Livermore" "Livermore" "Livermore" "Livermore" ...
## $ DAILY_OBS_COUNT : int 1 1 1 1 1 1 1 1 1 1 ...
## $ PERCENT_COMPLETE : num 100 100 100 100 100 100 100 100 100 100 ...
## $ AQS_PARAMETER_CODE : int 88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
## $ AQS_PARAMETER_DESC : chr "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
## $ CBSA_CODE : int 41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
## $ CBSA_NAME : chr "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
## $ STATE_CODE : int 6 6 6 6 6 6 6 6 6 6 ...
## $ STATE : chr "California" "California" "California" "California" ...
## $ COUNTY_CODE : int 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY : chr "Alameda" "Alameda" "Alameda" "Alameda" ...
## $ SITE_LATITUDE : num 37.7 37.7 37.7 37.7 37.7 ...
## $ SITE_LONGITUDE : num -122 -122 -122 -122 -122 ...
## - attr(*, ".internal.selfref")=<externalptr>
summary(data_2019$`Daily Mean PM2.5 Concentration`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.200 4.000 6.500 7.734 9.900 120.900
table(data_2019$`Daily Mean PM2.5 Concentration`)
##
## -2.2 -2 -1.9 -1.8 -1.7 -1.6 -1.5 -1.4 -1.3 -1.2 -1.1 -1 -0.9
## 1 12 16 11 12 12 8 11 10 16 9 12 10
## -0.8 -0.7 -0.6 -0.5 -0.4 -0.3 -0.2 -0.1 0 0.1 0.2 0.3 0.4
## 11 15 9 15 13 24 29 26 70 39 69 84 82
## 0.5 0.6 0.7 0.8 0.9 1 1.1 1.2 1.3 1.4 1.5 1.6 1.7
## 126 136 160 142 160 359 185 214 204 223 294 238 333
## 1.8 1.9 2 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3
## 271 289 450 271 411 365 310 482 383 470 397 397 562
## 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9 4 4.1 4.2 4.3
## 403 525 459 400 587 458 590 452 471 720 467 603 512
## 4.4 4.5 4.6 4.7 4.8 4.9 5 5.1 5.2 5.3 5.4 5.5 5.6
## 490 616 522 616 503 480 723 471 598 509 491 596 486
## 5.7 5.8 5.9 6 6.1 6.2 6.3 6.4 6.5 6.6 6.7 6.8 6.9
## 600 495 452 582 445 537 462 451 577 394 552 427 425
## 7 7.1 7.2 7.3 7.4 7.5 7.6 7.7 7.8 7.9 8 8.1 8.2
## 542 374 446 411 416 486 400 490 389 367 484 331 407
## 8.3 8.4 8.5 8.6 8.7 8.8 8.9 9 9.1 9.2 9.3 9.4 9.5
## 372 306 455 317 390 329 303 427 293 389 306 305 389
## 9.6 9.7 9.8 9.9 10 10.1 10.2 10.3 10.4 10.5 10.6 10.7 10.8
## 253 340 296 253 338 229 317 269 253 306 229 254 241
## 10.9 11 11.1 11.2 11.3 11.4 11.5 11.6 11.7 11.8 11.9 12 12.1
## 205 252 224 257 186 199 229 196 230 152 140 233 139
## 12.2 12.3 12.4 12.5 12.6 12.7 12.8 12.9 13 13.1 13.2 13.3 13.4
## 177 151 175 179 141 168 122 134 169 132 153 138 110
## 13.5 13.6 13.7 13.8 13.9 14 14.1 14.2 14.3 14.4 14.5 14.6 14.7
## 157 125 142 102 117 132 84 129 83 112 107 88 106
## 14.8 14.9 15 15.1 15.2 15.3 15.4 15.5 15.6 15.7 15.8 15.9 16
## 89 82 122 86 114 65 90 88 72 87 73 67 69
## 16.1 16.2 16.3 16.4 16.5 16.6 16.7 16.8 16.9 17 17.1 17.2 17.3
## 72 75 50 57 69 60 60 52 38 74 52 54 36
## 17.4 17.5 17.6 17.7 17.8 17.9 18 18.1 18.2 18.3 18.4 18.5 18.6
## 40 54 40 52 41 42 33 42 46 38 30 52 41
## 18.7 18.8 18.9 19 19.1 19.2 19.3 19.4 19.5 19.6 19.7 19.8 19.9
## 36 35 35 36 30 34 34 41 43 32 28 26 31
## 20 20.1 20.2 20.3 20.4 20.5 20.6 20.7 20.8 20.9 21 21.1 21.2
## 40 29 28 18 26 24 21 38 29 21 41 20 26
## 21.3 21.4 21.5 21.6 21.7 21.8 21.9 22 22.1 22.2 22.3 22.4 22.5
## 16 22 21 11 14 17 10 33 21 20 14 16 24
## 22.6 22.7 22.8 22.9 23 23.1 23.2 23.3 23.4 23.5 23.6 23.7 23.8
## 19 12 16 11 20 14 18 17 19 20 11 5 13
## 23.9 24 24.1 24.2 24.3 24.4 24.5 24.6 24.7 24.8 24.9 25 25.1
## 16 12 12 12 10 9 11 15 18 11 10 11 10
## 25.2 25.3 25.4 25.5 25.6 25.7 25.8 25.9 26 26.1 26.2 26.3 26.4
## 6 5 12 15 3 11 10 6 11 8 8 10 9
## 26.5 26.6 26.7 26.8 26.9 27 27.1 27.2 27.3 27.4 27.5 27.6 27.7
## 10 10 10 8 5 12 6 11 8 14 4 8 6
## 27.8 27.9 28 28.1 28.2 28.3 28.4 28.5 28.6 28.7 28.8 28.9 29
## 9 2 15 9 11 8 2 9 6 10 5 5 6
## 29.1 29.2 29.3 29.4 29.5 29.6 29.7 29.8 29.9 30 30.1 30.2 30.3
## 6 6 7 6 8 10 11 9 7 11 2 4 3
## 30.4 30.5 30.6 30.7 30.8 30.9 31 31.1 31.2 31.3 31.4 31.5 31.6
## 4 12 12 8 7 10 7 9 10 4 6 8 5
## 31.7 31.8 31.9 32 32.1 32.2 32.3 32.4 32.5 32.6 32.7 32.8 32.9
## 7 2 4 3 3 4 4 3 4 3 7 7 5
## 33 33.1 33.2 33.3 33.4 33.5 33.6 33.7 33.8 33.9 34 34.1 34.2
## 3 8 8 2 4 6 2 4 5 4 1 3 5
## 34.3 34.4 34.5 34.6 34.7 34.8 34.9 35 35.1 35.2 35.3 35.4 35.5
## 1 4 4 4 3 5 2 1 4 1 3 3 4
## 35.6 35.7 35.8 35.9 36 36.1 36.2 36.3 36.4 36.7 36.8 36.9 37
## 3 2 2 3 3 2 5 6 7 2 2 3 1
## 37.1 37.2 37.3 37.4 37.5 37.6 37.7 37.8 37.9 38 38.1 38.3 38.4
## 7 3 1 1 3 1 1 3 3 1 2 1 2
## 38.5 38.6 38.7 38.9 39 39.1 39.2 39.3 39.4 39.5 39.6 39.7 39.8
## 2 3 1 2 5 2 3 1 1 5 2 3 1
## 39.9 40 40.1 40.2 40.3 40.4 40.5 40.6 40.7 40.9 41 41.1 41.2
## 2 2 3 2 4 1 1 1 3 5 1 4 2
## 41.3 41.4 41.5 41.6 41.7 41.8 41.9 42.2 42.3 42.8 43.1 43.3 43.4
## 2 3 1 2 1 1 1 1 1 1 2 1 3
## 43.5 43.6 44 44.2 44.3 44.4 44.5 44.7 44.8 45.1 45.3 45.4 45.5
## 1 1 1 2 2 1 1 1 1 1 1 1 1
## 45.7 45.8 46 46.3 46.4 46.5 46.7 47.1 47.2 47.4 47.5 47.9 48
## 1 1 1 1 4 1 3 3 1 1 1 1 1
## 48.1 48.2 48.8 49 49.3 49.4 49.6 50.1 50.2 50.6 50.7 50.9 51.3
## 1 1 1 1 1 2 1 1 1 2 2 2 1
## 52.3 52.4 53 53.1 54.7 55.7 57 57.6 57.7 58.2 58.8 59.1 60.4
## 1 1 2 2 1 1 1 2 1 1 1 1 1
## 60.5 62.2 62.6 63.4 66.1 68.4 68.5 70.1 70.3 71.2 73.9 75.1 77.3
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 77.4 81.3 83.7 91.1 97.3 98.9 103.5 104.5 120.9
## 1 1 1 1 1 1 1 1 1
As with the 2004 data, I remove all observations with a concentration below 0.
# Keep only readings of zero or more, then re-check the range of the 2019 data.
data_2019 <- data_2019[`Daily Mean PM2.5 Concentration` >= 0]
summary(data_2019[["Daily Mean PM2.5 Concentration"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 4.00 6.50 7.78 10.00 120.90
# Tag each table with its year (added in place), stack the two years, and give
# the columns used by the later plots short names.
data_2004[, Year := "2004"]
data_2019[, Year := "2019"]
all_data <- rbind(data_2004, data_2019) %>%
  rename(
    concentration = `Daily Mean PM2.5 Concentration`,
    lat = SITE_LATITUDE,
    lon = SITE_LONGITUDE
  )
library(leaflet)
# Map every monitoring record's site location, coloured by year.
pal <- colorFactor(c("Blue", "DarkOrange"), domain = all_data$Year)
leaflet(all_data) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addCircleMarkers(
    lat = ~lat, lng = ~lon,
    opacity = 1, fillOpacity = 1, radius = 1,
    # Resolve Year through the formula so colours follow the map's own data.
    color = ~pal(Year)
  ) %>%
  # Without a legend the two colours cannot be matched to their years.
  addLegend(
    position = "bottomright", pal = pal, values = ~Year,
    title = "Year", opacity = 1
  )
The monitoring sites are clustered around two areas, Los Angeles and San Francisco, and more of them are located near the coastline than inland.
# Count NA entries in the key variables, then the total number of records.
# is.na() is FALSE for empty strings, so blank text fields are not counted here.
sum(is.na(all_data$Date))
## [1] 0
sum(is.na(all_data$concentration))
## [1] 0
sum(is.na(all_data$`Site Name`))
## [1] 0
sum(is.na(all_data$COUNTY))
## [1] 0
sum(is.na(all_data$STATE))
## [1] 0
length(all_data$`Site Name`)
## [1] 72036
# Share of records whose site name is missing, computed from the data instead
# of a hard-coded tally and row count. sum(is.na(`Site Name`)) is 0, so the
# missing names are presumably stored as empty strings (TODO confirm); both NA
# and "" are counted here.
p <- mean(is.na(all_data$`Site Name`) | !nzchar(all_data$`Site Name`))
p
## [1] 0.004594925
In Step 1, I removed all implausible PM2.5 concentrations (values below 0). In this section, I find that about 0.46% of the site names are missing, so the plots based on sites in Los Angeles may be less convincing.
library(ggforce)
# Histogram of PM2.5 by year, with a zoomed panel for concentrations under 70.
all_data[!is.na(concentration)] %>%
  ggplot(aes(x = concentration, fill = Year)) +
  geom_histogram() +
  facet_zoom(x = concentration < 70)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
From the graph, we can see that, for California as a whole, PM2.5 concentrations generally dropped from 2004 to 2019.
# Daily PM2.5 by county, coloured by year; county labels are rotated to fit.
all_data[!is.na(concentration)] %>%
  ggplot(aes(x = COUNTY, y = concentration, color = Year)) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1))
From the graph, we can see that PM2.5 concentrations decreased from 2004 to 2019 in most California counties, except in 8 counties such as Butte, Contra Costa, and Mariposa.
# Restrict to Los Angeles County and compare the two years site by site.
LA_data <- all_data %>%
  filter(COUNTY == "Los Angeles")
LA_data[!is.na(concentration)] %>%
  ggplot(aes(x = `Site Name`, y = concentration, fill = Year)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1))
Based on the sites in Los Angeles, and excluding some sites with missing data, we can see that PM2.5 concentrations decreased from 2004 to 2019.